* Close any log left open by an earlier run, reset the session, move to the project root,
* and open a fresh log for this program *
capture log close
clear all
set more off
cd "${master_dir}"
log using "${log_dir}/1a-PullandProcessPings.log", replace
***************************************************************************************************
* 
* Program: 1a-PullandProcessPings.do
* Purpose: Matches polling place GPS coordinates to buildings & pulls pings within 100 meters
*          of those building centroids.
* Sections:
*     1. Iterate through state-level pings
*     2. Aggregate state files, match to census block groups, and create a master polling place file
*     3. Take buildings that are far from their geofences and just use the pin.
*     4. Create the master shapefile for matched buildings and their convex hulls 
*     5. Find all pings near a polling place before and during the 2016 election.
* Files Used:
*     1. 
* Files Created:
*     1. 1a-PullandProcessPings.log
* Notes: Install these packages before running:
*      ssc install geoinpoly
*      ssc install geodist
*      ssc install geonear
*      ssc install gtools   (provides gegen, which this program calls)
*
***************************************************************************************************

***************************************************************************************************
*  1. Iterate through state-level pings
***************************************************************************************************

* Machine-specific locations of the building footprint shapefiles, the polling place data file,
* scratch space for per-state temp files, join files, and the census block group shapefiles.
* Add a branch for each new user, or define these globals before running this program. *

if "`c(username)'"=="kechen"  {
    global stata_shp_path "Q:\Building_Shapefiles\US_States_Stata"
    global data_path "Q:\Polling_Place_Project"
    global temp_file_path "F:\PollingPlaces_temp"
    global join_file_path "Q:\Join_Files"
    global polling_file_name "PollingPlaces2016_8_19_19"
    global cens_stata_shp_path "Q:\Census_Shapefiles\Block_Groups_2017_Stata"
}

* Stop early with a readable message if any required global is still empty, rather than
* failing later on a malformed file path *

foreach req_global in stata_shp_path data_path temp_file_path join_file_path ///
  polling_file_name cens_stata_shp_path {
    if `"${`req_global'}"' == "" {
        display as error "global `req_global' is not set for user `c(username)'"
        exit 198
    }
}

* States to process; each name is the state name with blanks removed and is used to build
* file names and to filter the polling place file.
* NOTE(review): Alaska, Colorado, Hawaii, Oregon, and Washington are not in this list;
* confirm the omission is intentional. *

#d ;
global states "Alabama Arizona Arkansas California Connecticut Delaware DistrictofColumbia Florida 
  Georgia Idaho Illinois Indiana Iowa Kansas Kentucky Louisiana Maine Maryland Massachusetts 
  Michigan Minnesota Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey NewMexico 
  NewYork NorthCarolina NorthDakota Ohio Oklahoma Pennsylvania RhodeIsland SouthCarolina SouthDakota 
  Tennessee Texas Utah Vermont Virginia WestVirginia Wisconsin Wyoming" ;
#d cr

local state_list "${states}"

* Open the polling place pins file, then, state by state: find the nearest building, test for
* rooftop and convex hull matches, attach building characteristics, zip code, and time zone,
* and save a temporary state file. The full pins file is preserved so each iteration can start
* from it again. *

use "${data_path}/${polling_file_name}.dta", clear

preserve

foreach st_name in `state_list' {

    keep if subinstr(state," ","",.) == "`st_name'"
    gen building_match = 0
    gen building_match_convex = 0
    gen dist_nearest_building_m = .
    gen building_area_m2 = .
    gen max_building_radius_m = .
    gen nearest_building_ID = .
    gen building_match_ID = .
    gen building_match_cnvhull_ID = .
    rename poll_id PollingPlace_ID
    
    * Nearest footprint point to each pin; distance is stored in whole meters *
    
    geonear PollingPlace_ID lat lng using "${stata_shp_path}/`st_name'_Coordinates.dta", n(n _Y _X)
    replace dist_nearest_building_m = round(km_to_nid * 1000)
    drop km_to_nid
    rename nid n
    
    * Translate the nearest coordinate row (n) into the building it belongs to (_ID) *
    
    merge m:1 n using "${stata_shp_path}/`st_name'_Coordinates.dta", keep(master match) keepusing(_ID) nogen
    replace nearest_building_ID = _ID
    drop n _ID
    
    * Now find building rooftop matches and convex hull matches. A rooftop match also counts as
    * a hull match, and its distance is flipped to negative to mark that the pin is inside. *
    
    geoinpoly lat lng using "${stata_shp_path}/`st_name'_Coordinates.dta"
    replace building_match = 1 if _ID != .
    replace building_match_convex = 1 if building_match == 1
    replace building_match_ID = _ID
    replace building_match_cnvhull_ID = _ID
    replace dist_nearest_building_m = (0 - dist_nearest_building_m) if building_match == 1
    drop _ID
    
    geoinpoly lat lng if building_match == 0 using "${stata_shp_path}/`st_name'_Coordinates_ConvexHulls.dta"
    replace building_match_convex = 1 if _ID != .
    replace building_match_cnvhull_ID = _ID if _ID != .
    
    * It is possible to match to more than one convex hull, which leaves several rows for one
    * polling place. Break ties with the nearest building. When none of the candidates is the
    * nearest building, keep the candidate with the lowest ID instead of dropping every row,
    * so that no polling place silently disappears from the state file.
    * NOTE(review): confirm lowest-ID is the preferred fallback for that case. *
    
    bysort PollingPlace_ID: gen long tie_n = _N
    gen byte tie_is_nearest = building_match_cnvhull_ID == nearest_building_ID
    bysort PollingPlace_ID: egen byte tie_has_nearest = max(tie_is_nearest)
    drop if tie_n > 1 & tie_has_nearest == 1 & tie_is_nearest == 0
    bysort PollingPlace_ID (building_match_cnvhull_ID): keep if _n == 1
    drop tie_n tie_is_nearest tie_has_nearest
    
    * now set _ID according to our best guess of the building: a roof match, then hull match, then closest *
    
    replace _ID = building_match_ID
    replace _ID = building_match_cnvhull_ID if building_match == 0
    replace _ID = nearest_building_ID if building_match_convex == 0
    
    * Now merge on building characteristics *
    
    merge m:1 _ID using "${stata_shp_path}/`st_name'_Characteristics.dta", keep(master match) nogen
    replace building_area_m2 = round(Area_m2)
    replace max_building_radius_m = floor(max_building_radius + 1)
    drop Area_m2 max_building_radius
    
    
    * Now map building center to closest zip code *
    
    gen double ObsID = _n
    geonear ObsID building_cent_lat building_cent_lng using ///
      "${join_file_path}/ZipCodes_LatLong_2013Census.dta", neighbors(zip lat lng)
    
    drop ObsID 
    rename nid zip
    rename km_to_nid km_to_zip
    drop km_to_zip
    
    * get to local time; only zip codes present in the time zone file are kept *
    
    merge m:1 zip using "${join_file_path}/TimeZone_by_ZipCode.dta", keep(match) nogen
    
    * Hour offset for each time zone name, listed without the leading America/ prefix.
    * North_Dakota/New_Salem and North_Dakota/Beulah are Central time zones that the
    * original table left unassigned. *
    
    local tz_utc5 New_York Detroit Indiana/Indianapolis Indiana/Marengo Indiana/Vevay ///
      Indiana/Vincennes Indiana/Winamac Kentucky/Louisville Kentucky/Monticello
    local tz_utc6 Chicago Indiana/Knox Indiana/Petersburg Indiana/Tell_City Menominee ///
      North_Dakota/Center North_Dakota/New_Salem North_Dakota/Beulah
    local tz_utc7 Denver Boise Phoenix Shiprock
    local tz_utc8 Los_Angeles
    
    gen byte HourOffset = .
    forvalues offset = 5/8 {
        foreach tz of local tz_utc`offset' {
            replace HourOffset = `offset' if timezone == "America/`tz'"
        }
    }
    
    * List any time zone that still has no offset so it can be added above *
    
    tab timezone if HourOffset == .
    
    rename zip Zip_PollingPlace
    rename city City_PollingPlaceZip
    
    sort PollingPlace_ID
    compress
    save "${temp_file_path}/temp_PollingPlaces2016_`st_name'.dta", replace
    
    * Bring back the full pins file for the next state and keep it preserved *
    
    restore, preserve
}

***************************************************************************************************
*  2. Aggregate state files, match to census block groups, and create a master polling place file
***************************************************************************************************

* Stack the per-state temp files into one polling place dataset *

clear
local state_list "${states}"

foreach st_name of local state_list {
    append using "${temp_file_path}/temp_PollingPlaces2016_`st_name'.dta"
}


* Place each building centroid inside a 2017 census block group polygon and attach that
* polygon's GISJOIN code. The building-level _ID is dropped first because geoinpoly returns
* its polygon match in a variable named _ID. *

drop _ID
geoinpoly building_cent_lat building_cent_lng using ///
  "${cens_stata_shp_path}/Census_2017_Coordinates.dta"

merge m:1 _ID using "${cens_stata_shp_path}/Census_2017_Buildings.dta", ///
  keepusing(GISJOIN) keep(master match) nogenerate
rename GISJOIN gisjoin
drop _ID

* Flag polling places whose nearest building is more than 99 meters away. Stata treats a
* missing value as larger than any number, so a missing distance is flagged as well. *

gen byte Pseudo_Building = (dist_nearest_building_m > 99)

sort PollingPlace_ID
compress
save "${data_path}/PollingPlaces2016_w_TimeZones_and_Buildings_nocategories.dta", replace

* Building categories were added using suggested classifications from the OpenStreetMap
* Nominatim API and the Google Maps API. Both APIs returned erroneous suggestions in roughly 10,000
* cases, and these were categorized by hand. The final dataset with the categories is saved as
* PollingPlaces2016_w_TimeZones_and_Buildings.dta

****************************************************************************************************  
* 3. Take buildings that are far from their geofences and just use the pin.
****************************************************************************************************  

* Rationale: for these polling places either Google has the wrong location, or Google is right
* and Microsoft simply lacks the building shapefile. To catch the second case we build a pseudo
* building centered on the pin itself; if the pin was just a miss, the data will be dropped
* later by the likely voter filter. *

foreach axis in lat lng {
    replace building_cent_`axis' = `axis' if Pseudo_Building == 1
}

* Write the slim lookup file (ID plus centroid) that geonear uses as its neighbor file *

keep PollingPlace_ID building_cent_lat building_cent_lng
sort PollingPlace_ID
compress
save "${data_path}/PollingPlaces2016_GeoNear.dta", replace

****************************************************************************************************  
* 4. Create the master shapefile for matched buildings and their convex hulls 
****************************************************************************************************  

* Build two master coordinate files, one for matched building footprints and one for their
* convex hulls. For each kind: cut every state coordinate file down to the buildings that
* were assigned to a polling place, append the states, and give each building an _ID that is
* unique across states.
*
* The state files are joined with merge m:1 against a list of unique building IDs. The
* earlier merge m:m paired rows positionally within _ID and could append duplicate vertex
* rows whenever a building had more polling places than coordinate rows. *

local state_list "${states}"

foreach shape_kind in footprints hulls {

    * File name suffix: empty for footprints, _ConvexHulls for hulls *
    
    local shape_sfx ""
    if "`shape_kind'" == "hulls" {
        local shape_sfx "_ConvexHulls"
    }
    
    foreach st_name of local state_list {
    
        * One row per building ID used by this state's polling places *
        
        use _ID state using "${temp_file_path}/temp_PollingPlaces2016_`st_name'.dta", clear
        drop if missing(_ID)
        bysort _ID: keep if _n == 1
        tempfile matched_buildings
        save `matched_buildings', replace
        
        * Keep only the coordinate rows of those buildings *
        
        use "${stata_shp_path}/`st_name'_Coordinates`shape_sfx'.dta", clear
        merge m:1 _ID using `matched_buildings', keep(master match) keepusing(state) nogen
        keep if state != ""
        compress
        save "${temp_file_path}/temp_PollingGeofences2016`shape_sfx'_`st_name'.dta", replace
    }
    
    clear
    
    foreach st_name of local state_list {
        append using "${temp_file_path}/temp_PollingGeofences2016`shape_sfx'_`st_name'.dta"
    }
    
    * Building IDs repeat across states, so keep the original as State_ID and number the
    * state-by-building groups to get a new _ID *
    
    sort state _ID n
    rename _ID State_ID
    gegen _ID = group(state State_ID)
    sort _ID n
    compress
    save "${data_path}/PollingGeofences2016`shape_sfx'.dta", replace
}

****************************************************************************************************  
* 5. Find all pings near a polling place before and durring the 2016 election
****************************************************************************************************  

* Set directories for the polling place project data and the ping files *

if "`c(username)'"=="kechen"  {
    global data_path "Q:\Polling_Place_Project"
    global pings_file_path "X:\Safegraph_Data_Stata\Pings"
}


* Election is Nov 8th, 2016.
* Each pass loads one daily ping file, keeps pings within 100 meters of a polling place
* centroid, marks whether the ping lies inside a polling place convex hull, and saves one
* output file per day.
* NOTE(review): the loops below cover m = 1 and days 1 through 5 only, which does not reach
* the 8th; presumably the ranges are edited between runs. Confirm. *

foreach m of num 1/1 {
    foreach d of num 1/5 {
        use "${pings_file_path}/1`m'_16/1`m'_16_All_SortGH_Day`d'.dta", clear
        
        * filter to continental US *
        
        keep if (latitude > 24.7433195 & latitude < 49.3457868 & longitude > -124.7844079 & ///
          longitude < -66.9513812)
        
        * take each ping and compute its nearest building that we think was a polling place *
        
        gen long BaseID = _n
        geonear BaseID latitude longitude using "${data_path}/PollingPlaces2016_GeoNear.dta", ///
          neighbors(PollingPlace_ID building_cent_lat building_cent_lng) ///
          genstub(NearestPollingPlace)
        
        * keep pings closer than 0.1 km and store the distance in whole meters *
        
        keep if km_to_NearestPollingPlace < 0.1
        gen int Dist_to_PollingPlace_M = round(km_to_NearestPollingPlace * 1000)
        drop BaseID km_to_NearestPollingPlace
        
        * Now merge to mark if this is a pseudo building. The merge key is cloned so it keeps
        * the storage type of the ID; a plain gen would create a float, which cannot hold
        * integers above 16,777,216 exactly. *
        
        clonevar PollingPlace_ID = NearestPollingPlace
        merge m:1 PollingPlace_ID using ///
          "${data_path}/PollingPlaces2016_w_TimeZones_and_Buildings.dta", ///
          keep(master match) keepusing(Pseudo_Building) nogen
        drop PollingPlace_ID
        
        * label if pings fall into the convex hull of the building.
        * NOTE(review): this tests against every hull in the master file, not only the hull
        * of the nearest polling place; confirm that is intended. *
        
        geoinpoly latitude longitude using "${data_path}/PollingGeofences2016_ConvexHulls.dta"
        gen byte Ping_in_ConvexHull = _ID ~= .
        drop _ID
        
        * fix for pseudo buildings: they have no real hull, so a ping counts as inside only
        * when it is less than 35 meters from the pin *
        
        replace Ping_in_ConvexHull = 0 if Pseudo_Building == 1
        replace Ping_in_ConvexHull = 1 if Pseudo_Building == 1 & Dist_to_PollingPlace_M < 35
        
        compress
        save ///
          "${data_path}/Polling_Places_Pings_100m_Radius/Pings_100m_PollingPlace_1`m'_`d'_16.dta", ///
          replace
    }
}

log close

* End the do-file cleanly. The earlier stop is not a Stata command and ended the run with
* an unrecognized-command error. *
exit
